# lets import libraries
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
# load data
e_commerce_df = pd.read_csv(r"C:\Users\jki\Downloads\ecommerce_customer_data.csv")
e_commerce_df.head(5)
| User_ID | Gender | Age | Location | Device_Type | Product_Browsing_Time | Total_Pages_Viewed | Items_Added_to_Cart | Total_Purchases | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Female | 23 | Ahmedabad | Mobile | 60 | 30 | 1 | 0 |
| 1 | 2 | Male | 25 | Kolkata | Tablet | 30 | 38 | 9 | 4 |
| 2 | 3 | Male | 32 | Bangalore | Desktop | 37 | 13 | 5 | 0 |
| 3 | 4 | Male | 35 | Delhi | Mobile | 7 | 20 | 10 | 3 |
| 4 | 5 | Male | 27 | Bangalore | Tablet | 35 | 20 | 8 | 2 |
# Lets chech for missing values
missing_values = e_commerce_df.isna().sum()
print(missing_values)
User_ID 0 Gender 0 Age 0 Location 0 Device_Type 0 Product_Browsing_Time 0 Total_Pages_Viewed 0 Items_Added_to_Cart 0 Total_Purchases 0 dtype: int64
# let check at the statistsical summary
e_commerce_df.describe()
| User_ID | Age | Product_Browsing_Time | Total_Pages_Viewed | Items_Added_to_Cart | Total_Purchases | |
|---|---|---|---|---|---|---|
| count | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 | 500.000000 |
| mean | 250.500000 | 26.276000 | 30.740000 | 27.182000 | 5.150000 | 2.464000 |
| std | 144.481833 | 5.114699 | 15.934246 | 13.071596 | 3.203127 | 1.740909 |
| min | 1.000000 | 18.000000 | 5.000000 | 5.000000 | 0.000000 | 0.000000 |
| 25% | 125.750000 | 22.000000 | 16.000000 | 16.000000 | 2.000000 | 1.000000 |
| 50% | 250.500000 | 26.000000 | 31.000000 | 27.000000 | 5.000000 | 2.000000 |
| 75% | 375.250000 | 31.000000 | 44.000000 | 38.000000 | 8.000000 | 4.000000 |
| max | 500.000000 | 35.000000 | 60.000000 | 50.000000 | 10.000000 | 5.000000 |
# lets have a look at non-numeric column
e_commerce_df.describe(include='object')
print(e_commerce_df)
User_ID Gender Age Location Device_Type Product_Browsing_Time \
0 1 Female 23 Ahmedabad Mobile 60
1 2 Male 25 Kolkata Tablet 30
2 3 Male 32 Bangalore Desktop 37
3 4 Male 35 Delhi Mobile 7
4 5 Male 27 Bangalore Tablet 35
.. ... ... ... ... ... ...
495 496 Male 24 Mumbai Tablet 40
496 497 Female 24 Hyderabad Desktop 8
497 498 Male 19 Pune Desktop 33
498 499 Male 28 Chennai Desktop 6
499 500 Female 31 Kolkata Mobile 16
Total_Pages_Viewed Items_Added_to_Cart Total_Purchases
0 30 1 0
1 38 9 4
2 13 5 0
3 20 10 3
4 20 8 2
.. ... ... ...
495 45 0 0
496 34 4 5
497 41 0 3
498 32 7 4
499 10 10 4
[500 rows x 9 columns]
Now, let’s have a look at the distribution of age in the dataset
# Histogram Age
fig = px.histogram(e_commerce_df, x='Age', title='Distribution of Age')
fig.show()
Now, let’s have a look at the gender distribution:
# Bar chart for 'Gender'
gender_counts = e_commerce_df['Gender'].value_counts().reset_index()
gender_counts.columns = ['Gender', 'Count']
fig = px.bar(gender_counts, x='Gender',
y='Count',
title='Gender Distribution')
fig.show()
# 'Product_Browsing_Time' vs 'Total_Pages_Viewed'
fig = px.scatter(e_commerce_df, x='Product_Browsing_Time', y='Total_Pages_Viewed',
title='Product Browsing Time vs. Total Pages Viewed',
trendline='ols')
fig.show()
The above scatter plot shows no consistent pattern or strong association between the time spent browsing products and the total number of pages viewed. It indicates that customers are not necessarily exploring more pages if they spend more time on the website, which might be due to various factors such as the website design, content relevance, or individual user preferences.
# Grouped Analysis
gender_grouped = e_commerce_df.groupby('Gender')['Total_Pages_Viewed'].mean().reset_index()
gender_grouped.columns = ['Gender', 'Average_Total_Pages_Viewed']
fig = px.bar(gender_grouped, x='Gender', y='Average_Total_Pages_Viewed',
title='Average Total Pages Viewed by Gender')
fig.show()
devices_grouped = e_commerce_df.groupby('Device_Type')['Total_Pages_Viewed'].mean().reset_index()
devices_grouped.columns = ['Device_Type', 'Average_Total_Pages_Viewed']
fig = px.bar(devices_grouped, x='Device_Type', y='Average_Total_Pages_Viewed',
title='Average Total Pages Viewed by Devices')
fig.show()
e_commerce_df['CLV'] = (e_commerce_df['Total_Purchases'] * e_commerce_df['Total_Pages_Viewed']) / e_commerce_df['Age']
e_commerce_df['Segment'] = pd.cut(e_commerce_df['CLV'], bins=[1, 2.5, 5, float('inf')],
labels=['Low Value', 'Medium Value', 'High Value'])
segment_counts = e_commerce_df['Segment'].value_counts().reset_index()
segment_counts.columns = ['Segment', 'Count']
# Create a bar chart to visualize the customer segments
fig = px.bar(segment_counts, x='Segment', y='Count',
title='Customer Segmentation by CLV')
fig.update_xaxes(title='Segment')
fig.update_yaxes(title='Number of Customers')
fig.show()
# Funnel analysis
funnel_data = e_commerce_df[['Product_Browsing_Time', 'Items_Added_to_Cart', 'Total_Purchases']]
funnel_data = funnel_data.groupby(['Product_Browsing_Time', 'Items_Added_to_Cart']).sum().reset_index()
fig = px.funnel(funnel_data, x='Product_Browsing_Time', y='Items_Added_to_Cart', title='Conversion Funnel')
fig.show()
In the above graph, the x-axis represents the time customers spend browsing products on the e-commerce platform. The y-axis represents the number of items added to the shopping cart by customers during their browsing sessions.
# Calculate churn rate
e_commerce_df['Churned'] = e_commerce_df['Total_Purchases'] == 0
churn_rate = e_commerce_df['Churned'].mean()
print(churn_rate)
0.198
A churn rate of 0.198 indicates that a significant portion of customers has churned, and addressing this churn is important for maintaining business growth and profitability.